{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4204c7e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e807f09",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create folder for each dataset first    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09845339",
   "metadata": {},
   "outputs": [],
   "source": [
    "def save_json(content, save_path):\n",
    "    with open(save_path, 'w') as f:\n",
    "        f.write(json.dumps(content))\n",
    "def load_jsonl(filename):\n",
    "    with open(filename, \"r\") as f:\n",
    "        return [json.loads(l.strip(\"\\n\")) for l in f.readlines()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2edfddc1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# nextqa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d49722d",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_train_csv = 'train.csv'\n",
    "raw_val_csv = 'val.csv'\n",
    "raw_train = pd.read_csv(raw_train_csv, delimiter=',')\n",
    "raw_val = pd.read_csv(raw_val_csv, delimiter=',')\n",
    "train = []\n",
    "val = []\n",
    "key = ['video', 'question', 'a0', 'a1', 'a2', 'a3', 'a4', 'answer', 'qid', 'type'] \n",
    "for i in range(len(raw_train)):\n",
    "    data = {}\n",
    "    for k in key:\n",
    "        data[k] = raw_train.iloc[i][k]\n",
    "    train.append(data)\n",
    "\n",
    "for i in range(len(raw_val)):\n",
    "    data = {}\n",
    "    for k in key:\n",
    "        data[k] = raw_val.iloc[i][k]\n",
    "    val.append(data) \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "943e9523",
   "metadata": {},
   "outputs": [],
   "source": [
    "vid_map = json.load(open('map_vid_vidorID.json'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "afe73814",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_train = []\n",
    "new_val = []\n",
    "for qa in train:\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = vid_map[str(qa['video'])]\n",
    "    qa_dict['num_option'] = 5\n",
    "    qa_dict['qid'] = '_'.join([qa['type'], str(qa['video']), str(qa['qid'])])\n",
    "    for i in range(5):\n",
    "        qa_dict['a{}'.format(str(i))] = qa['a{}'.format(str(i))]+'.'\n",
    "    qa_dict['answer'] = qa['answer']\n",
    "    qa_dict['question'] = qa['question']+'?'\n",
    "    new_train.append(qa_dict)\n",
    "\n",
    "for qa in val:\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = vid_map[str(qa['video'])]\n",
    "    qa_dict['num_option'] = 5\n",
    "    qa_dict['qid'] = '_'.join([qa['type'], str(qa['video']), str(qa['qid'])])\n",
    "    for i in range(5):\n",
    "        qa_dict['a{}'.format(str(i))] = qa['a{}'.format(str(i))]+'.'\n",
    "    qa_dict['answer'] = qa['answer']\n",
    "    qa_dict['question'] = qa['question']+'?'\n",
    "    new_val.append(qa_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "218a75d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_json(new_train, 'nextqa/train.json')\n",
    "save_json(new_val, 'nextqa/val.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67638a5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# STAR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fed28d5a",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_path = 'STAR_train.json'\n",
    "val_path = 'STAR_val.json'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71918325",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = json.load(open(train_path))\n",
    "val = json.load(open(val_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "209c3b2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_train = []\n",
    "new_val = []\n",
    "for qa in train:\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = qa['video_id']\n",
    "    qa_dict['num_option'] = 4\n",
    "    qa_dict['qid'] = qa['question_id']\n",
    "    for i, choice in enumerate(qa['choices']):\n",
    "        qa_dict['a{}'.format(str(i))] = choice['choice']\n",
    "        if choice['choice'] == qa['answer']:\n",
    "            answer = i\n",
    "    qa_dict['answer'] = answer\n",
    "    qa_dict['question'] = qa['question']\n",
    "    qa_dict['start'] = qa['start']\n",
    "    qa_dict['end'] = qa['end']\n",
    "    new_train.append(qa_dict)\n",
    "\n",
    "for qa in val:\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = qa['video_id']\n",
    "    qa_dict['num_option'] = 4\n",
    "    qa_dict['qid'] = qa['question_id']\n",
    "    for i, choice in enumerate(qa['choices']):\n",
    "        qa_dict['a{}'.format(str(i))] = choice['choice']\n",
    "        if choice['choice'] == qa['answer']:\n",
    "            answer = i\n",
    "    qa_dict['answer'] = answer\n",
    "    qa_dict['question'] = qa['question']\n",
    "    qa_dict['start'] = qa['start']\n",
    "    qa_dict['end'] = qa['end']\n",
    "    new_val.append(qa_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6ced28c",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_json(new_train, 'star/train.json')\n",
    "save_json(new_val, 'star/val.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c72d66f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# How2QA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ab388e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_path = 'how2qa_train_release.jsonl'\n",
    "val_path = 'how2qa_val_release.jsonl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81bdadb9",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = load_jsonl(train_path)\n",
    "val = load_jsonl(val_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5164d95e",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_train = []\n",
    "new_val = []\n",
    "for i, qa in enumerate(train):\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = qa['vid_name']\n",
    "    qa_dict['num_option'] = 4\n",
    "    qa_dict['qid'] = 'HOW2QA_' + str(i)\n",
    "    for j in range(4):\n",
    "        qa_dict['a{}'.format(str(j))] = qa['a{}'.format(str(j))]\n",
    "        \n",
    "    qa_dict['answer'] = qa['answer_idx']\n",
    "    qa_dict['question'] = qa['q']\n",
    "    qa_dict['start'] = qa['ts'].split('-')[0]\n",
    "    qa_dict['end'] = qa['ts'].split('-')[1]\n",
    "        \n",
    "    new_train.append(qa_dict)\n",
    "\n",
    "for i, qa in enumerate(val):\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = qa['vid_name']\n",
    "    qa_dict['num_option'] = 4\n",
    "    qa_dict['qid'] = 'HOW2QA_' + str(i)\n",
    "    for j in range(4):\n",
    "        qa_dict['a{}'.format(str(j))] = qa['a{}'.format(str(j))]\n",
    "        \n",
    "    qa_dict['answer'] = qa['answer_idx']\n",
    "    qa_dict['question'] = qa['q']\n",
    "    qa_dict['start'] = qa['ts'].split('-')[0]\n",
    "    qa_dict['end'] = qa['ts'].split('-')[1]\n",
    "        \n",
    "    new_val.append(qa_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2daa2a4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_json(new_train, 'how2qa/train.json')\n",
    "save_json(new_val, 'how2qa/val.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e569c5b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TVQA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "319d0fb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_path = 'tvqa_train.jsonl'\n",
    "val_path = 'tvqa_val.jsonl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0f498f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = load_jsonl(train_path)\n",
    "val = load_jsonl(val_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c24578a",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_train = []\n",
    "new_val = []\n",
    "\n",
    "for i, qa in enumerate(train):\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = qa['vid_name']\n",
    "    qa_dict['num_option'] = 5\n",
    "    qa_dict['qid'] = 'TVQA_' + str(i)\n",
    "    for j in range(5):\n",
    "        qa_dict['a{}'.format(str(j))] = qa['a{}'.format(str(j))]\n",
    "    qa_dict['answer'] = qa['answer_idx']\n",
    "    qa_dict['question'] = qa['q']\n",
    "    qa_dict['start'] = qa['ts'].split('-')[0]\n",
    "    qa_dict['end'] = qa['ts'].split('-')[1]\n",
    "        \n",
    "    new_train.append(qa_dict)\n",
    "\n",
    "for i, qa in enumerate(val):\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = qa['vid_name']\n",
    "    qa_dict['num_option'] = 5\n",
    "    qa_dict['qid'] = 'TVQA_' + str(i)\n",
    "    for j in range(5):\n",
    "        qa_dict['a{}'.format(str(j))] = qa['a{}'.format(str(j))]\n",
    "    qa_dict['answer'] = qa['answer_idx']\n",
    "    qa_dict['question'] = qa['q']\n",
    "    qa_dict['start'] = qa['ts'].split('-')[0]\n",
    "    qa_dict['end'] = qa['ts'].split('-')[1]\n",
    "        \n",
    "    new_val.append(qa_dict)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "348cafde",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_json(new_train, 'tvqa/train.json')\n",
    "save_json(new_val, 'tvqa/val.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52259cd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# VLPE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "53646ebf",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_path = 'vlep_train_release.jsonl'\n",
    "val_path = 'vlep_dev_release.jsonl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff92c404",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = load_jsonl(train_path)\n",
    "val = load_jsonl(val_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd62a11e",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_train = []\n",
    "new_val = []\n",
    "\n",
    "for i, qa in enumerate(train):\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = qa['vid_name']\n",
    "    qa_dict['num_option'] = 2\n",
    "    qa_dict['qid'] = 'VLEP_' + str(qa['example_id'])\n",
    "\n",
    "    for j in range(2):\n",
    "        qa_dict['a{}'.format(str(j))] = qa['events'][j]\n",
    "    qa_dict['answer'] = qa['answer']\n",
    "    # qa_dict['question'] = qa['q']\n",
    "    qa_dict['start'] = qa['ts'][0]\n",
    "    qa_dict['end'] = qa['ts'][1]\n",
    "    \n",
    "    new_train.append(qa_dict)\n",
    "\n",
    "for i, qa in enumerate(val):\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = qa['vid_name']\n",
    "    qa_dict['num_option'] = 2\n",
    "    qa_dict['qid'] = 'VLEP_' + str(qa['example_id'])\n",
    "\n",
    "    for j in range(2):\n",
    "        qa_dict['a{}'.format(str(j))] = qa['events'][j]\n",
    "    qa_dict['answer'] = qa['answer']\n",
    "    # qa_dict['question'] = qa['q']\n",
    "    qa_dict['start'] = qa['ts'][0]\n",
    "    qa_dict['end'] = qa['ts'][1]\n",
    "        \n",
    "    new_val.append(qa_dict)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "186de10a",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_json(new_train, 'vlep/train.json')\n",
    "save_json(new_val, 'vlep/val.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7083bf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# qvh"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a00480e",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_path = 'highlight_train_release.jsonl'\n",
    "val_path = 'highlight_val_release.jsonl'\n",
    "test_path = 'highlight_test_release.jsonl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8cc2f260",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = load_jsonl(train_path)\n",
    "val = load_jsonl(val_path)\n",
    "test = load_jsonl(test_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "507365fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "new_train = []\n",
    "new_val = []\n",
    "new_test = []\n",
    "for i, qa in enumerate(train):\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = qa['vid']\n",
    "    qa_dict['qid'] = 'QVHighlight_' + str(qa['qid'])\n",
    "    qa_dict['query'] = qa['query']\n",
    "    qa_dict['duration'] = qa['duration']\n",
    "    qa_dict['relevant_windows'] = qa['relevant_windows']\n",
    "    new_train.append(qa_dict)\n",
    "\n",
    "for i, qa in enumerate(val):\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = qa['vid']\n",
    "    qa_dict['qid'] = 'QVHighlight_' + str(qa['qid'])\n",
    "    qa_dict['query'] = qa['query']\n",
    "    qa_dict['duration'] = qa['duration']\n",
    "    qa_dict['relevant_windows'] = qa['relevant_windows']\n",
    "    new_val.append(qa_dict)\n",
    "\n",
    "for i, qa in enumerate(test):\n",
    "    qa_dict = {}\n",
    "    qa_dict['video'] = qa['vid']\n",
    "    qa_dict['qid'] = 'QVHighlight_' + str(qa['qid'])\n",
    "    qa_dict['query'] = qa['query']\n",
    "    qa_dict['duration'] = qa['duration']\n",
    "    new_test.append(qa_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f9754fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "save_json(new_train, 'qvh/train.json')\n",
    "save_json(new_val, 'qvh/val.json')\n",
    "save_json(new_test, 'qvh/test.json')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
